######################
#nss_working
#paul stainier
#started 12/22/2019
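#process the nss data and create three keys using round_num 59-64, 66, 68:
#1. household 
#2. individual 
#3. food item
#and three schedule 10 keys using round_num 60-62, 64, 66, 68:
#4. household_10
#5. individual_10
#6. activity_10
#merge the household keys with the district crosswalk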
#
#####################

library(plyr)
library(dplyr)
library(data.table)
library(readr)
#remove every (non-hidden) object from the workspace so the run starts clean;
#this must stay above the toggles and directory settings defined below
rm(list = ls())

#########################
#toggle TRUE/FALSE for the part that 
#you want to run 
#(written out as TRUE/FALSE because T and F are ordinary variables
#that can be reassigned)
#########################
household <- TRUE
individual <- TRUE
item <- TRUE
household_10 <- TRUE
individual_10 <- TRUE
activity_10 <- TRUE


#########################
#replace with own directories before running
#
#every assignment below has no value in this file: put a quoted directory
#path after each "<-" before running, otherwise the script cannot run as intended
#
#input_path: folder with one sub-folder per round number (59, 60, ...) that
#            holds that round's nss_<round>_*.csv files
#input_path10: folder with one sub-folder per round number that holds that
#            round's nss10_round<round>_*.csv files
#output_directory: folder the key csv files are written to
#crosswalk_directory: folder that contains nss_district_crosswalk_all.csv
#########################
input_path <- 
input_path10 <- 
output_directory <- 
crosswalk_directory <- 



###########################
#household
#make dataset of household-level variables
#for NSS round_nums 59 through 64, 66, and 68
##########################
if (household) {
  #build the household key: clean each round separately, stack all rounds
  #once after the loop, merge with the district crosswalk and write the result
  #to output_directory. files are addressed by their full path, so the working
  #directory is not changed by this section.
  household_rounds <- c(59:64, 66, 68)
  household_by_round <- vector("list", length(household_rounds))
  for (i in seq_along(household_rounds)) {
    round_num <- household_rounds[i]
    print(round_num)
    round_dir <- file.path(input_path, round_num)
    household_data <- read_csv(
      file.path(round_dir, paste0("nss_", round_num, "_house.csv"))
    )
    #schedule type 2 has a reference period of 7 days for some foods
    if (round_num == 60) {
      household_data <- household_data[which(household_data$schedule_type == 1), ]
    }
    household_data$round <- round_num
    household_data$schedule <- 1
    setnames(household_data, old = "land_possessed", new = "land_owned")

    #columns that a round's file does not provide are filled with 0 so that
    #every round offers the full column set required by select() below
    for (optional_col in c("non_household_member_meals", "nic", "nco")) {
      if (!(optional_col %in% names(household_data))) {
        household_data[[optional_col]] <- 0
      }
    }

    #in these rounds the weight column is called weight_comb
    if (round_num < 64 || round_num == 68) {
      setnames(household_data, old = "weight_comb", new = "weight")
    }
    #rounds 64 and 66: keep only the last two digits of the district code
    if (round_num == 64 || round_num == 66) {
      household_data <- mutate(household_data,
                               district = as.numeric(district))
      household_data$district <- household_data$district -
        floor(household_data$district / 100) * 100
    }

    household_data$nic <- as.numeric(as.character(household_data$nic))

    #the survey date lives in the id file; attach it by house_id
    id_data <- read_csv(
      file.path(round_dir, paste0("nss_", round_num, "_id.csv"))
    )
    #schedule type 2 has a reference period of 7 days for some foods
    if (round_num == 60) {
      id_data <- id_data[which(id_data$schedule_type == 1), ]
    }
    id_data <- dplyr::select(id_data, house_id, surv_date)
    #inner join: households without a row in the id file are dropped
    household_data <- merge(household_data, id_data, by = c("house_id"))

    #rounds 66 and 68 call the expenditure column mpce_mrp
    if (round_num > 65) {
      colnames(household_data)[colnames(household_data) == "mpce_mrp"] <- "mpce_30"
    }
    household_data <- dplyr::select(
      household_data,
      house_id, state_region, state, district,
      round, schedule, surv_date, sector, mpce_30, house_size, house_type,
      religion, social_group, land_owned, cook_code, light_code,
      non_household_member_meals, nic, nco, weight
    )

    household_data <- mutate(household_data,
                             surv_date = as.numeric(surv_date),
                             state_region = as.numeric(state_region))
    #year = 2000 + last two digits of surv_date,
    #month = the two digits in front of those
    household_data$year <- 2000 + household_data$surv_date %% 100
    household_data$month <- (household_data$surv_date %% 10000 -
                               household_data$surv_date %% 100) / 100

    household_by_round[[i]] <- household_data
  }

  #stack all rounds in a single rbind instead of growing the table per round
  household_data_full <- do.call(rbind, household_by_round)

  district_crosswalk <- read_csv(
    file.path(crosswalk_directory, "nss_district_crosswalk_all.csv")
  )

  #state is re-derived from state_region (all digits except the last one)
  household_data_full$state <- floor(household_data_full$state_region / 10)
  #NOTE(review): this is an inner join, so households whose
  #(state, district, round) is missing from the crosswalk are dropped, while
  #the household_10 section keeps them with all.x - confirm this is intended
  household_data_full <- merge(household_data_full, district_crosswalk,
                               by = c("state", "district", "round"))
  write_csv(household_data_full,
            file.path(output_directory, "nss_5968all_household_key.csv"))
}

###########################
#individual
#make dataset of individual-level variables
#for NSS round_nums 59 through 64, 66, and 68
##########################
if (individual) {
  #build the individual key: clean each round separately, stack all rounds
  #once after the loop and write the result a single time (the file used to be
  #rewritten in full after every round). files are addressed by their full
  #path, so the working directory is not changed by this section.
  individual_rounds <- c(59:64, 66, 68)
  individual_by_round <- vector("list", length(individual_rounds))
  for (i in seq_along(individual_rounds)) {
    round_num <- individual_rounds[i]
    print(round_num)
    individual_data <- read_csv(
      file.path(input_path, round_num,
                paste0("nss_", round_num, "_individual.csv"))
    )
    #schedule type 2 has a reference period of 7 days for some foods
    if (round_num == 60) {
      individual_data <- individual_data[which(individual_data$schedule_type == 1), ]
    }
    individual_data$round <- round_num
    individual_data$schedule <- 1
    #in these rounds the weight column is called weight_comb
    if (round_num < 64 || round_num == 68) {
      setnames(individual_data, old = "weight_comb", new = "weight")
    }
    individual_data <- dplyr::select(
      individual_data,
      person_num, house_id, round, schedule, relation_to_head,
      sex, age, marital_stat, educ, days_away,
      meals_per_day, meals_school, meals_empl,
      meals_other, meals_pay, meals_home, weight
    )

    individual_data$educ <- as.numeric(as.character(individual_data$educ))
    #diagnostics: distribution and storage type of educ for this round.
    #educ is numeric in every round, so stacking cannot alter it and the
    #former second table taken from the stacked data was identical to this one
    print(table(individual_data$educ))
    print(typeof(individual_data$educ))

    individual_by_round[[i]] <- individual_data
  }

  #stack all rounds in a single rbind instead of growing the table per round
  individual_data_full <- do.call(rbind, individual_by_round)
  write_csv(individual_data_full,
            file.path(output_directory, "nss_5968all_individual_key.csv"))
}

###########################
#item
#make dataset of item-level variables
#for NSS round_nums 59 through 64, 66, and 68
##########################
if (item) {
  #build the food item key: clean each round separately, stack all rounds
  #once after the loop and write the result to output_directory. files are
  #addressed by their full path, so the working directory is not changed by
  #this section.

  #add home_prod_qty / home_prod_val to an item table:
  #source_code 2 -> the full consumed quantity/value,
  #source_code 3 -> half of the consumed quantity/value,
  #any other source_code -> 0
  add_home_production <- function(items) {
    items$home_prod_qty <- ifelse(items$source_code == 2, items$total_cons_qty, 0)
    items$home_prod_val <- ifelse(items$source_code == 2, items$total_cons_val, 0)
    half_rows <- which(items$source_code == 3)
    items$home_prod_qty[half_rows] <- items$total_cons_qty[half_rows] / 2
    items$home_prod_val[half_rows] <- items$total_cons_val[half_rows] / 2
    items
  }

  item_rounds <- c(59:64, 66, 68)
  item_by_round <- vector("list", length(item_rounds))
  for (i in seq_along(item_rounds)) {
    round_num <- item_rounds[i]
    print(round_num)
    round_dir <- file.path(input_path, round_num)
    item_data <- read_csv(
      file.path(round_dir, paste0("nss_", round_num, "_item.csv"))
    )
    item_data$round <- round_num
    item_data$schedule <- 1
    item_data$source_code <- as.numeric(item_data$source_code)
    #in these rounds the weight column is called weight_comb
    if (round_num < 64 || round_num == 68) {
      setnames(item_data, old = "weight_comb", new = "weight")
    }
    #home production is derived from source_code in rounds 59, 60, 62, 63, 64;
    #the other rounds must already provide home_prod_qty / home_prod_val,
    #because both columns are selected below
    if (round_num %in% c(59, 60, 62, 63, 64)) {
      item_data <- add_home_production(item_data)
    }

    if (round_num > 65) {
      item_data <- item_data[which(item_data$schedule_type == 1), ]
    }

    if (round_num == 60) {
      #schedule type 2 has a reference period of 7 days for some foods
      item_data <- item_data[which(item_data$schedule_type == 1), ]
      #round 60 has a second item file that gets the same treatment
      item_data2 <- read_csv(
        file.path(round_dir, paste0("nss_", round_num, "_item2.csv"))
      )
      item_data2 <- item_data2[which(item_data2$schedule_type == 1), ]
      item_data2$round <- round_num
      item_data2$schedule <- 1
      #coerce like the main file so both tables compare and stack as numbers
      item_data2$source_code <- as.numeric(item_data2$source_code)
      setnames(item_data2, old = "weight_comb", new = "weight")
      item_data2 <- add_home_production(item_data2)
      item_data <- rbind(item_data, item_data2)
    }

    item_data <- dplyr::select(
      item_data,
      house_id, round, schedule, item_code,
      home_prod_qty, home_prod_val,
      total_cons_qty, total_cons_val, source_code
    )
    item_by_round[[i]] <- item_data
  }

  #stack all rounds in a single rbind instead of growing the table per round
  item_data_full <- do.call(rbind, item_by_round)
  write_csv(item_data_full,
            file.path(output_directory, "nss_5968all_item_key.csv"))
}


###########################
#household_10
#make dataset of household-level variables
#for NSS thick round_nums 60, 61, 62, 64, 66, and 68
##########################
if (household_10) {
  #build the schedule 10 household key: clean each round separately, stack all
  #rounds once after the loop, merge with the district crosswalk (keeping
  #unmatched households) and write the result to output_directory. files are
  #addressed by their full path, so the working directory is not changed by
  #this section.
  household10_rounds <- c(60:62, 64, 66, 68)
  household10_by_round <- vector("list", length(household10_rounds))
  for (i in seq_along(household10_rounds)) {
    round_num <- household10_rounds[i]
    print(round_num)
    round_dir <- file.path(input_path10, round_num)
    if (round_num != 61) {
      household_data <- read_csv(
        file.path(round_dir, paste0("nss10_round", round_num, "_house.csv"))
      )
      id_data <- read_csv(
        file.path(round_dir, paste0("nss10_round", round_num, "_id.csv"))
      )
      id_data <- dplyr::select(id_data, house_id, surv_date)
      if (round_num == 68) {
        #round 68: house_id is rebuilt by concatenating its components
        #NOTE(review): as.numeric() cannot represent ids longer than about
        #15 digits exactly - confirm the concatenated id stays below that
        household_data$house_id <- as.numeric(
          paste0(household_data$fsu, household_data$hamlet_num,
                 household_data$ss_stratum, household_data$house_num)
        )
      }
      #inner join: households without a row in the id file are dropped
      household_data <- merge(household_data, id_data, by = c("house_id"))
    } else {
      #round 61: only the id file is read and used as the household table
      household_data <- read_csv(
        file.path(round_dir, paste0("nss10_round", round_num, "_id.csv"))
      )
    }

    household_data <- mutate(household_data,
                             round = round_num,
                             schedule = 10)

    #the weight column is called weight_comb up to round 64 and mult_comb in
    #round 68; round 66 is not renamed
    if (round_num <= 64) {
      setnames(household_data, old = "weight_comb", new = "weight")
    }
    if (round_num == 68) {
      setnames(household_data, old = "mult_comb", new = "weight")
    }
    #disabled: reduce the district code to its last two digits
    #if(round_num == 64 | round_num == 66){
    #  household_data$district = household_data$district - floor(household_data$district/100)*100
    #}

    household_data$nic <- as.numeric(as.character(household_data$nic))
    household_data <- dplyr::select(
      household_data,
      house_id, round, schedule, sector, state, district, house_size,
      nic, nco, house_type, religion, social_group, land_possessed,
      weight, surv_date
    )

    #year = 2000 + last two digits of surv_date,
    #month = the two digits in front of those
    household_data <- mutate(household_data,
                             surv_date = as.numeric(surv_date),
                             year = 2000 + surv_date %% 100,
                             month = (surv_date %% 10000 - surv_date %% 100) / 100)

    household10_by_round[[i]] <- household_data
  }

  #stack all rounds in a single rbind instead of growing the table per round
  household_data_full <- do.call(rbind, household10_by_round)
  #the merge keys must be numeric to be comparable with the crosswalk
  household_data_full <- mutate(household_data_full,
                                state = as.numeric(state),
                                district = as.numeric(district),
                                round = as.numeric(round))

  district_crosswalk <- read_csv(
    file.path(crosswalk_directory, "nss_district_crosswalk_all.csv")
  )
  #left join: households without a crosswalk entry are kept
  household_data_full <- merge(household_data_full, district_crosswalk,
                               by = c("state", "district", "round"),
                               all.x = TRUE)
  write_csv(household_data_full,
            file.path(output_directory, "nss10_6068_household_key.csv"))
}


###########################
#individual_10
#make dataset of individual-level variables
#for NSS round_nums 60, 61, 62, 64, 66, and 68
##########################
if (individual_10) {
  #build the schedule 10 individual key. the person-level variables are
  #spread over different files depending on the round, so each round group has
  #its own assembly branch; every branch ends with the same set of columns.
  #all rounds are stacked once after the loop and written to output_directory.
  #files are addressed by their full path, so the working directory is not
  #changed by this section.
  individual10_rounds <- c(60:62, 64, 66, 68)
  individual10_by_round <- vector("list", length(individual10_rounds))
  for (i in seq_along(individual10_rounds)) {
    round_num <- individual10_rounds[i]
    print(round_num)
    round_dir <- file.path(input_path10, round_num)
    #full path of one of this round's files, e.g. "job" -> nss10_round60_job.csv
    round_file <- function(part) {
      file.path(round_dir, paste0("nss10_round", round_num, "_", part, ".csv"))
    }

    if (round_num == 60) {
      #round 60: everything but the unemployment columns is in the job file
      individual_data <- read_csv(round_file("job"))
      unemp_data <- read_csv(round_file("unemp"))
      individual_data <- dplyr::select(
        individual_data,
        person_id, house_id, person_num, relation_to_head, sex, age,
        marital_stat, educ_gen, educ_tech, principal_status, principal_nic,
        principal_nco, sub_dummy, sub_status, sub_nic, sub_nco
      )
      unemp_data <- dplyr::select(unemp_data,
                                  person_id, unemp_duration, employed_ever)
      individual_data <- merge(individual_data, unemp_data,
                               by = "person_id", all.x = TRUE)
    } else if (round_num %in% c(61, 66, 68)) {
      #rounds 61, 66, 68: demographics, principal job, subsidiary job and
      #unemployment come from four separate files
      individual_data <- read_csv(round_file("educ"))
      princ_data <- read_csv(round_file("job"))
      sub_data <- read_csv(round_file("sub_job"))
      unemp_data <- read_csv(round_file("unemp"))
      if (round_num == 68) {
        #round 68: person_id is built from house_id and person_num, and
        #employed_ever is not taken from the file but set to NA
        individual_data$person_id <- paste0(individual_data$house_id,
                                            individual_data$person_num)
        princ_data$person_id <- paste0(princ_data$house_id, princ_data$person_num)
        sub_data$person_id <- paste0(sub_data$house_id, sub_data$person_num)
        unemp_data$person_id <- paste0(unemp_data$house_id, unemp_data$person_num)
        unemp_data <- dplyr::select(unemp_data,
                                    person_id, unemp_duration)
        unemp_data$employed_ever <- NA
      } else {
        unemp_data <- dplyr::select(unemp_data,
                                    person_id, unemp_duration, employed_ever)
      }
      individual_data <- dplyr::select(
        individual_data,
        person_id, house_id, person_num, relation_to_head, sex, age,
        marital_stat, educ_gen, educ_tech
      )
      princ_data <- dplyr::select(
        princ_data,
        person_id, principal_status, principal_nic, principal_nco, sub_dummy
      )
      sub_data <- dplyr::select(sub_data,
                                person_id, sub_status, sub_nic, sub_nco)

      #left joins: every person of the demographics file is kept
      individual_data <- merge(individual_data, princ_data,
                               by = "person_id", all.x = TRUE)
      individual_data <- merge(individual_data, sub_data,
                               by = "person_id", all.x = TRUE)
      individual_data <- merge(individual_data, unemp_data,
                               by = "person_id", all.x = TRUE)
    } else if (round_num == 62) {
      #round 62: job file joined with the demographics file; no unemployment
      #file is read, so both unemployment columns are NA
      individual_data <- read_csv(round_file("job"))
      educ_data <- read_csv(round_file("educ"))
      individual_data <- dplyr::select(
        individual_data,
        person_id, principal_status, principal_nic, principal_nco, sub_dummy,
        sub_status, sub_nic, sub_nco
      )
      educ_data <- dplyr::select(
        educ_data,
        person_id, house_id, person_num, relation_to_head, sex, age,
        marital_stat, educ_gen, educ_tech
      )
      individual_data <- merge(individual_data, educ_data,
                               by = "person_id", all.x = TRUE)
      individual_data$unemp_duration <- NA
      individual_data$employed_ever <- NA
    } else if (round_num == 64) {
      #round 64: everything is in the job file; no unemployment file is read,
      #so both unemployment columns are NA
      individual_data <- read_csv(round_file("job"))
      individual_data <- dplyr::select(
        individual_data,
        person_id, house_id, person_num, relation_to_head, sex, age,
        marital_stat, educ_gen, educ_tech, principal_status, principal_nic,
        principal_nco, sub_dummy, sub_status, sub_nic, sub_nco
      )
      individual_data$unemp_duration <- NA
      individual_data$employed_ever <- NA
    }

    #NOTE(review): person_id2 is numeric here, whereas the activity_10 section
    #builds it as a character string - confirm which type downstream joins
    #expect and that the concatenated id stays within about 15 digits
    individual_data <- mutate(individual_data,
                              principal_nic = as.numeric(as.character(principal_nic)),
                              person_id2 = as.numeric(paste0(house_id, person_num)),
                              round = round_num)

    individual10_by_round[[i]] <- individual_data
  }

  #stack all rounds in a single rbind instead of growing the table per round
  individual_data_full <- do.call(rbind, individual10_by_round)
  write_csv(individual_data_full,
            file.path(output_directory, "nss10_6068_individual_key.csv"))
}

###########################
#activity_10
#make dataset of person-level activity and wage variables
#for NSS thick round_nums 60, 61, 62, 64, 66, and 68
##########################
if (activity_10) {
  #build the schedule 10 activity key from each round's wage file, stack all
  #rounds once after the loop and write the result to output_directory. files
  #are addressed by their full path, so the working directory is not changed
  #by this section.
  activity_rounds <- c(60:62, 64, 66, 68)
  activity_by_round <- vector("list", length(activity_rounds))
  for (i in seq_along(activity_rounds)) {
    round_num <- activity_rounds[i]
    print(round_num)
    activity_data <- read_csv(
      file.path(input_path10, round_num,
                paste0("nss10_round", round_num, "_wage.csv"))
    )
    #these round_nums don't have their own unemployed section, instead we will use the activities
    if (round_num == 62 || round_num == 64) {
      activity_data$unemployed <- NA
    }
    activity_data <- dplyr::select(
      activity_data,
      house_id, person_num, intensity_1,
      intensity_2, intensity_3, intensity_4, intensity_5,
      intensity_6, intensity_7, wage_cash, wage_kind, wage_total, week_status,
      week_nic, week_nco, unemployed, current_day_status, current_day_nic
    )

    #person_id2 is house_id followed by person_num, kept as a character string
    activity_data <- mutate(activity_data,
                            person_id2 = paste0(house_id, person_num),
                            week_nic = as.numeric(as.character(week_nic)),
                            current_day_nic = as.numeric(as.character(current_day_nic)),
                            round = round_num)

    activity_by_round[[i]] <- activity_data
  }

  #stack all rounds in a single rbind instead of growing the table per round
  activity_data_full <- do.call(rbind, activity_by_round)
  write_csv(activity_data_full,
            file.path(output_directory, "nss10_6068_activity_key.csv"))
}

